필요한 패키지와 라이브러리 다운로드
#install.packages('plotly')
#install.packages('ggthemes')
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(plotly)
##
## 다음의 패키지를 부착합니다: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(data.table)
##
## 다음의 패키지를 부착합니다: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
library(ggplot2)
library(ggthemes)
library(dplyr)
CSV를 데이터프레임에 가져오기
test <- read.csv('test.csv', na.strings = c(''))
train <- read.csv('train.csv', na.strings = c(''))
결측치 및 이상치 확인
colSums(is.na(test))
## PassengerId Pclass Name Sex Age SibSp
## 0 0 0 0 86 0
## Parch Ticket Fare Cabin Embarked
## 0 0 1 327 0
colSums(is.na(train))
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 177
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 687 2
Family Size라는 새로운 Column 추가
test$FamilySize <- test$SibSp + test$Parch + 1 #형제+부모+자신
train$FamilySize <- train$SibSp + train$Parch + 1
Fare의 결측치는 평균으로 대체
test$Fare[is.na(test$Fare)] <- mean(test$Fare, na.rm = TRUE)
Embarked의 결측치는 다수값으로 대체
table(train$Embarked) #S가 가장 많다
##
## C Q S
## 168 77 644
train$Embarked[is.na(train$Embarked)] <- 'S'
train$SName <- gsub('(.*, )|(\\..*)','',train$Name)
table(train$Sex, train$SName)
##
## Capt Col Don Dr Jonkheer Lady Major Master Miss Mlle Mme Mr Mrs Ms
## female 0 0 0 1 0 1 0 0 182 2 1 0 125 1
## male 1 2 1 6 1 0 2 40 0 0 0 517 0 0
##
## Rev Sir the Countess
## female 0 0 1
## male 6 1 0
ect <- c('Capt','Col','Don','Dr','Jonkheer','the Countess',
'Lady','Major','Mlle','Mme','Rev','Sir','Dona')
train$SName[train$SName %in% ect] <- 'Others'
table(train$Sex, train$SName)
##
## Master Miss Mr Mrs Ms Others
## female 0 182 0 125 1 6
## male 40 0 517 0 0 20
m_M <- mean(train$Age[train$SName=='Master'], na.rm = TRUE)
m_Mr <- mean(train$Age[train$SName=='Mr'], na.rm = TRUE)
m_Mrs <- mean(train$Age[train$SName=='Mrs'], na.rm = TRUE)
m_Mis <- mean(train$Age[train$SName=='Miss'], na.rm = TRUE)
m_Ms <- mean(train$Age[train$SName=='Ms'], na.rm = TRUE)
m_O <- mean(train$Age[train$SName=='Others'], na.rm = TRUE)
train$Age <- ifelse(train$SName=='Master', ifelse(is.na(train$Age), m_M, train$Age), train$Age)
train$Age <- ifelse(train$SName=='Mr', ifelse(is.na(train$Age), m_Mr, train$Age), train$Age)
train$Age <- ifelse(train$SName=='Mrs', ifelse(is.na(train$Age), m_Mrs, train$Age), train$Age)
train$Age <- ifelse(train$SName=='Miss', ifelse(is.na(train$Age), m_Mis, train$Age), train$Age)
train$Age <- ifelse(train$SName=='Ms', ifelse(is.na(train$Age), m_Ms, train$Age), train$Age)
train$Age <- ifelse(train$SName=='Others', ifelse(is.na(train$Age), m_O, train$Age), train$Age)
test$SName <- gsub('(.*, )|(\\..*)','',test$Name)
test$SName[test$SName %in% ect] <- 'Others'
M <- mean(test$Age[test$SName=='Master'], na.rm = TRUE)
Mr <- mean(test$Age[test$SName=='Mr'], na.rm = TRUE)
Mrs <- mean(test$Age[test$SName=='Mrs'], na.rm = TRUE)
Mis <- mean(test$Age[test$SName=='Miss'], na.rm = TRUE)
Ms <- mean(test$Age[test$SName=='Ms'], na.rm = TRUE)
O <- mean(test$Age[test$SName=='Others'], na.rm = TRUE)
test$Age <- ifelse(test$SName=='Master', ifelse(is.na(test$Age), M, test$Age), test$Age)
test$Age <- ifelse(test$SName=='Mr', ifelse(is.na(test$Age), Mr, test$Age), test$Age)
test$Age <- ifelse(test$SName=='Mrs', ifelse(is.na(test$Age), Mrs, test$Age), test$Age)
test$Age <- ifelse(test$SName=='Miss', ifelse(is.na(test$Age), Mis, test$Age), test$Age)
test$Age <- ifelse(test$SName=='Ms', ifelse(is.na(test$Age), Ms, test$Age), test$Age)
test$Age <- ifelse(test$SName=='Others', ifelse(is.na(test$Age), O, test$Age), test$Age)
필요없는 열 제거
train <- subset(train, select=-c(Name, Cabin, Ticket))
test <- subset(test, select=-c(Name, Cabin, Ticket))
각 데이터 셋의 라벨 변환
# train dataset
train$FamilySize = cut(train$FamilySize, c(0, 1, 4, 15),
include.lowest = TRUE)
levels(train$FamilySize) = c("single", "small family(2~4)", "big family(5+)")
levels(train$Survived) = c('Not Survived','Survived')
# test dataset
test$FamilySize = cut(test$FamilySize, c(0, 1, 4, 15),
include.lowest = TRUE)
levels(test$FamilySize) = c("single", "small family(2~4)", "big family(5+)")
타입변환 : 범주형데이터(factor)
# train dataset
train <- train %>%
dplyr::mutate(Survived = factor(Survived),
Pclass = factor(Pclass,ordered = TRUE),
SName = factor(SName),
Sex = factor(Sex),
FamilySize = factor(FamilySize),
Embarked = factor(Embarked))
# test dataset
test <- test %>%
dplyr::mutate(Pclass = factor(Pclass,ordered = TRUE),
SName = factor(SName),
Sex = factor(Sex),
FamilySize = factor(FamilySize),
Embarked = factor(Embarked))
##EDA
#1 Fare
p1 <- plot_ly(type='box',
data=train,
x=~Survived,
y=~Fare,
color=~Survived,
alpha = 0.3
)%>%
layout(title='Survivor by Fare')
p1
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
#group_by(train$Embarked)
#2 Embarked
p2 <- train %>%
ggplot(aes(Embarked,fill = Survived)) +
geom_bar(stat='count',position = "fill", alpha=0.85) +
scale_fill_brewer(palette = "Paired") +
theme_minimal()+
#scale_y_continuous(labels = percent) +
labs(x = "Embarked", y = "Survival Rate",
title = "Survival by Embarked",
fill = 'Survival')
p2
#3 family size
p3 <- train %>%
ggplot(aes(FamilySize,fill = Survived)) +
geom_bar(stat='count',position = "fill", alpha=0.85) +
scale_fill_brewer(palette = "Paired") +
theme_minimal()+
#scale_y_continuous(labels = percent) +
labs(x = "FamilySize", y = "Survival Rate",
title = "Survival by FamilySize",
fill = 'Survival')
p3
#4 Age, Sex
p4 <- train %>%
filter(Survived=='1') %>% #생존자들 분포
ggplot(aes(x=Age, fill=Sex)) +
geom_density(alpha=0.4)+
theme_minimal()+
labs(title = "Survival Distribution")
p4
p5 <- train %>%
filter(Survived=='0') %>% #사망자들 분포
ggplot(aes(x=Age, fill=Sex)) +
geom_density(alpha=0.4)+
theme_minimal()+
labs(title = "Non-Survival Distribution")
p5
###로지스틱선형 회귀
#모델
levels(train$Survived) = c(0,1) #복구,, 필요한가?
#View(train)
model <- glm(Survived ~., family = binomial(link=logit), data=train)
model
##
## Call: glm(formula = Survived ~ ., family = binomial(link = logit),
## data = train)
##
## Coefficients:
## (Intercept) PassengerId
## 1.892e+01 2.043e-05
## Pclass.L Pclass.Q
## -1.472e+00 7.241e-02
## Sexmale Age
## -1.586e+01 -2.579e-02
## SibSp Parch
## -7.310e-02 8.344e-02
## Fare EmbarkedQ
## 3.922e-03 3.175e-02
## EmbarkedS FamilySizesmall family(2~4)
## -3.065e-01 -3.399e-01
## FamilySizebig family(5+) SNameMiss
## -3.048e+00 -1.651e+01
## SNameMr SNameMrs
## -3.515e+00 -1.578e+01
## SNameMs SNameOthers
## -2.331e+00 -3.544e+00
##
## Degrees of Freedom: 890 Total (i.e. Null); 873 Residual
## Null Deviance: 1187
## Residual Deviance: 710.7 AIC: 746.7
summary(model)
##
## Call:
## glm(formula = Survived ~ ., family = binomial(link = logit),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7021 -0.5322 -0.3847 0.5505 2.4268
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.892e+01 5.804e+02 0.033 0.97399
## PassengerId 2.043e-05 3.739e-04 0.055 0.95643
## Pclass.L -1.472e+00 2.296e-01 -6.411 1.45e-10 ***
## Pclass.Q 7.241e-02 2.026e-01 0.357 0.72076
## Sexmale -1.586e+01 5.804e+02 -0.027 0.97820
## Age -2.579e-02 9.757e-03 -2.643 0.00822 **
## SibSp -7.310e-02 2.141e-01 -0.341 0.73279
## Parch 8.344e-02 2.182e-01 0.382 0.70219
## Fare 3.922e-03 2.678e-03 1.465 0.14297
## EmbarkedQ 3.175e-02 4.031e-01 0.079 0.93722
## EmbarkedS -3.065e-01 2.573e-01 -1.191 0.23364
## FamilySizesmall family(2~4) -3.399e-01 3.747e-01 -0.907 0.36428
## FamilySizebig family(5+) -3.048e+00 1.103e+00 -2.762 0.00575 **
## SNameMiss -1.651e+01 5.804e+02 -0.028 0.97730
## SNameMr -3.515e+00 6.026e-01 -5.832 5.47e-09 ***
## SNameMrs -1.578e+01 5.804e+02 -0.027 0.97831
## SNameMs -2.331e+00 1.567e+03 -0.001 0.99881
## SNameOthers -3.544e+00 8.302e-01 -4.270 1.96e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1186.66 on 890 degrees of freedom
## Residual deviance: 710.73 on 873 degrees of freedom
## AIC: 746.73
##
## Number of Fisher Scoring iterations: 14
anova(model, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Survived
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 890 1186.66
## PassengerId 1 0.022 889 1186.63 0.8812
## Pclass 2 103.808 887 1082.82 < 2.2e-16 ***
## Sex 1 256.002 886 826.82 < 2.2e-16 ***
## Age 1 24.813 885 802.01 6.318e-07 ***
## SibSp 1 16.414 884 785.60 5.091e-05 ***
## Parch 1 0.496 883 785.10 0.4812
## Fare 1 1.529 882 783.57 0.2163
## Embarked 2 3.775 880 779.80 0.1514
## FamilySize 2 20.143 878 759.65 4.228e-05 ***
## SName 5 48.923 873 710.73 2.301e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 예측
result <- predict(model,newdata=test,type='response')
result <- ifelse(result > 0.5, 1, 0)
result
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 0 1 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 0 1 1 1 1 0 1 0 0 0 0 0 1 1 0 0 1 1 0 0
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 0 0 0 1 1 1 1 0 0 1 1 0 1 1 1 1 0 1 0 1
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 1 0 0 0 0 0 1 1 NA 1 1 0 1 0 1 0 1 0 1 0
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 1 1 1
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
## 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 0 1
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 1 1 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
## 0 0 1 0 1 0 1 0 0 0 0 0 1 0 1 0 1 1 0 1
## 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
## 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0
## 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240
## 1 0 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1
## 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260
## 1 1 0 0 1 0 1 0 1 1 1 0 1 0 0 0 0 0 1 0
## 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280
## 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0
## 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300
## 1 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0
## 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320
## 0 0 0 0 1 1 0 1 0 1 0 0 0 1 1 1 0 0 0 0
## 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340
## 0 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 0 1
## 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360
## 0 0 0 1 1 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1
## 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380
## 0 1 1 0 1 0 0 1 1 0 0 1 0 0 1 1 1 0 0 0
## 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400
## 0 0 1 1 0 1 0 0 0 0 1 1 1 0 0 1 0 1 0 0
## 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418
## 1 0 1 1 0 0 0 0 1 1 1 1 1 0 1 0 0 1